library(tidyverse) # Colección de paquetes para ciencia de datos (incluye ggplot2, dplyr, tidyr, readr, purrr, tibble)
library(reshape) # Herramientas para reorganizar datos
library(Hmisc) # resúmenes estadísticos
library(limma) # Análisis de datos de expresión genética
library(AnnotationDbi) # Interfaz para bases de datos de anotaciones bioinformáticas
library(org.Hs.eg.db) # Datos de anotación para genes humanos
library(VennDiagram) # Generación de diagramas de Venn
library(gridExtra) # Mostrar varias gráficas
library(patchwork) # Combinar múltiples ggplots en un único plot
library(ggrepel) # Mejora la visualización de texto en ggplots evitando solapamientos de texto
library(Rtsne) # Implementación de t-SNE
library(umap) # Implementación de UMAP
library(ggVennDiagram) # diagrama de Venn
library(fastDummies) # Para realizar el one-hot encoding
library(reactable) # para tablas interactivas
library(factoextra)
library(ggstatsplot) # para barras de porcentajes de las tablas de contingencia
library(UpSetR) # para diagramas de upsetDescargar los datos. Escalo los datos de la matriz para tener homogeneidad en las representaciones.
# covariables
ROSMAP_RINPMIAGESEX_covs <- readRDS("~/Library/CloudStorage/OneDrive-UNIVERSIDADDEMURCIA/Documentos/Fernando/Master Bioinformatica/TFM/datos/ROSMAP_RINPMIAGESEX_covs.rds")
covs <- ROSMAP_RINPMIAGESEX_covs
rownames(covs) <- covs$mrna_id
covs$study <- as.factor(covs$study)
covs$projid <- as.character(covs$projid)
covs$ceradsc <- as.factor(covs$ceradsc)
covs$cogdx <- as.factor(covs$cogdx)
covs$neuroStatus <- as.factor(covs$neuroStatus)
#datos corregidos
ROSMAP_RINPMIAGESEX_resids <- readRDS("~/Library/CloudStorage/OneDrive-UNIVERSIDADDEMURCIA/Documentos/Fernando/Master Bioinformatica/TFM/datos/ROSMAP_RINPMIAGESEX_resids.rds")
mat_exp <- scale(ROSMAP_RINPMIAGESEX_resids)One-hot encoding de la matriz de covariables
covs2 <- data.frame(matrix(ncol = 0, nrow = nrow(covs))) # Hago un dataframe vacio para meter los datos procesados
for (colname in names(covs)) {
if (is.factor(covs[[colname]]) & length(levels(covs[[colname]])) > 2) {
dummy_df <- dummy_cols(covs[colname],
remove_selected_columns = TRUE) # quitar las variables iniciales
dummy_df <- data.frame(lapply(dummy_df, factor))
covs2 <- cbind(covs2, dummy_df)
} else {
covs2[[colname]] <- covs[[colname]] # si son numéricas o de otra clase, las añadimos igual
}
}
rownames(covs2) <- covs2$mrna_idFiltrar la matriz Mnxm a Mnxm’, con m’ << n para filtrar los predictores, en este caso de la matriz de expresión mat_exp.
venn.plot <- venn.diagram(
x = list(GenesMatriz = colnames(mat_exp), GenesetAlzheimer = geneset_alz),
category.names = c("Matrix Genes", "Geneset Alzheimer"),
filename = NULL,
output = FALSE, # Asegura que no se exporte a un archivo
fill = c("#440154ff", '#fde725ff'),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorías
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = 25, #posicion de las categoricas
cat.dist = 0.1, #distancia de las categoricas
rotation.degree = 0,
margin = 0.1, # hacerla más pequeña
lwd=0.5,
lty = "dashed", # Estilo de línea discontinua
edge.col = "grey", # Color de los bordes
main = "Alzheimer genes in exp matrix",
main.fontface= "bold",
main.cex = 2,
main.pos = c(0.5, 1)
)
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)# Scree plot con los datos escalados
var_exp <- pca_KEGG$sdev^2
prop_var_exp <- var_exp / sum(var_exp)
cum_var_exp <- cumsum(prop_var_exp)
df_var_exp <- data.frame(Comp = 1:length(prop_var_exp), VarExp = prop_var_exp)
df_cum_var_exp <- data.frame(Comp = 1:length(cum_var_exp), CumVarExp = cum_var_exp)
ggplot(df_var_exp[1:20,], aes(x = Comp, y = VarExp)) +
geom_bar(stat = "identity", fill = "skyblue") +
geom_line(aes(group = 1), color = "blue") +
geom_point(color = "blue") +
theme_minimal() +
labs(x = "Principal components", y = "Variance", title = "Scree Plot scaled") +
ylim(c(0,1)) +
geom_line(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), color="#8B1A1A") +
geom_point(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), color = "red") +
geom_bar(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), stat = "identity", fill = "red", alpha= 0.25) +
annotate("text", x = 4, y = 0.85, label = "Cumulative Scree Plot", color = "#8B1A1A", size = 4) +
geom_text(data = df_cum_var_exp[seq(0,20,2),], aes(x=Comp, y = CumVarExp +0.04, label = round(CumVarExp, 2)))Selecciono los outliers de las dos primeras PC con el doble de SD.
outlierspc1 <- as.data.frame(pca_KEGG$x[abs(pca_KEGG$x[,1]) > 1.5*(pca_KEGG$sdev[1]),1])
outlierspc2 <- as.data.frame(pca_KEGG$x[abs(pca_KEGG$x[,2]) > 1.5*(pca_KEGG$sdev[2]),2])df <- as.data.frame(pca_KEGG$x)
plot1 <- ggplot(df, aes(x = PC1)) +
geom_density(fill = "#00CED1", alpha = 0.5) +
geom_vline(xintercept = mean(df$PC1) + 1.5*pca_KEGG$sdev[1], linetype = "dashed", color = "blue") +
geom_vline(xintercept = mean(df$PC1) - 1.5*pca_KEGG$sdev[1], linetype = "dashed", color = "blue") +
labs(title = "PC1 density with 2*SD scaled PCA") +
geom_text(data = outlierspc1,
aes(x = outlierspc1[,1], y= 0, label = rownames(outlierspc1)),
vjust = 1.5, hjust = 0, size = 3, color = "#E9965A", angle = 90, fontface= "bold") +
geom_rug(data = as.data.frame(pca_KEGG$x), aes(x= PC1, y = 0), color= ifelse(abs(pca_KEGG$x[,1]) > 1.5*(pca_KEGG$sdev[1]), "#8B1A1A", "grey")) +
theme_minimal()
plot2 <- ggplot(df, aes(x = PC2)) +
geom_density(fill = "#00CED1", alpha = 0.5) +
geom_vline(xintercept = mean(df$PC2) + 1.5*pca_KEGG$sdev[2], linetype = "dashed", color = "blue") +
geom_vline(xintercept = mean(df$PC2) - 1.5*pca_KEGG$sdev[2], linetype = "dashed", color = "blue") +
labs(title = "PC2 density with 2*SD scaled PCA") +
geom_text(data = outlierspc2,
aes(x = outlierspc2[,1], y= 0, label = rownames(outlierspc2)),
vjust = 1.5, hjust = 0, size = 3, color = "#E9965A", angle = 90, fontface= "bold") +
geom_rug(data = as.data.frame(pca_KEGG$x), aes(x= PC2, y = 0), color= ifelse(abs(pca_KEGG$x[,2]) > 1.5*(pca_KEGG$sdev[2]), "#8B1A1A", "grey")) +
theme_minimal()
grid.arrange(plot1, plot2, ncol = 1)mPC1.pos <- rownames(outlierspc1[outlierspc1[, 1] > 0 , , drop = F])
mPC1.neg <- rownames(outlierspc1[outlierspc1[, 1] < 0 , , drop = F ])
mPC2.pos <- rownames(outlierspc2[outlierspc2[, 1] > 0 , , drop = F ])
mPC2.neg <- rownames(outlierspc2[outlierspc2[, 1] < 0 , , drop = F])Tabla outliers PCA
# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(mPC1.pos),
length(mPC1.neg),
length(mPC2.pos),
length(mPC2.neg))
# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_mPC1.pos <- c(mPC1.pos, rep("", max_length - length(mPC1.pos)))
rownames_mPC1.neg <- c(mPC1.neg, rep("", max_length - length(mPC1.neg)))
rownames_mPC2.pos <- c(mPC2.pos, rep("", max_length - length(mPC2.pos)))
rownames_mPC2.neg <- c(mPC2.neg, rep("", max_length - length(mPC2.neg)))
final_table <- data.frame(
mPC1.pos = rownames_mPC1.pos,
mPC1.neg = rownames_mPC1.neg,
mPC2.pos = rownames_mPC2.pos,
mPC2.neg = rownames_mPC2.neg
)
final_tableset.seed(1234)
tsne <- Rtsne(mat_exp_alz_genes, dims = 2, theta = 0.0)
tsne.data <- as.data.frame(tsne$Y)
row.names(tsne.data) <- row.names(mat_exp_alz_genes)
tsne.data.covs <- merge(tsne.data, covs, by = "row.names")
tsne.data.covs$Row.names <- NULL
row.names(tsne.data.covs) <- tsne.data.covs$mrna_idrownames(tsne.data) <- rownames(mat_exp_alz_genes)
# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.tsne1 <- mean(tsne.data[,1])
sd.tsne1 <- sd(tsne.data[,1])
mean.tsne2 <- mean(tsne.data[,2])
sd.tsne2 <- sd(tsne.data[,2])
# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.tsne1 <- tsne.data[abs(tsne.data[,1] - mean.tsne1) > 1.5 * sd.tsne1, ]
outliers.tsne2 <-tsne.data[abs(tsne.data[,2] - mean.tsne2) > 1.5 * sd.tsne2, ]mtSNE1.pos <- rownames(outliers.tsne1[outliers.tsne1[, 1] > 0, , drop = F])
mtSNE1.neg <- rownames(outliers.tsne1[outliers.tsne1[, 1] < 0, , drop = F])
mtSNE2.pos <- rownames(outliers.tsne2[outliers.tsne2[, 1] > 0, , drop = F])
mtSNE2.neg <- rownames(outliers.tsne2[outliers.tsne2[, 1] < 0, , drop = F])Tabla outliers tSNE
# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(mtSNE1.pos),
length(mtSNE1.neg),
length(mtSNE2.pos),
length(mtSNE2.neg))
# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_mtSNE1.pos <- c(mtSNE1.pos, rep("", max_length - length(mtSNE1.pos)))
rownames_mtSNE1.neg <- c(mtSNE1.neg, rep("", max_length - length(mtSNE1.neg)))
rownames_mtSNE2.pos <- c(mtSNE2.pos, rep("", max_length - length(mtSNE2.pos)))
rownames_mtSNE2.neg <- c(mtSNE2.neg, rep("", max_length - length(mtSNE2.neg)))
final_table <- data.frame(
mtSNE1.pos = rownames_mtSNE1.pos,
mtSNE1.neg = rownames_mtSNE1.neg,
mtSNE2.pos = rownames_mtSNE2.pos,
mtSNE2.neg = rownames_mtSNE2.neg
)
final_tablelocal.config <- umap.defaults
# local.config$n_neighbors <- 4
# local.config$n_components <- 2
# local.config$n_epochs <- 100
# local.config$metric<- "euclidean"
set.seed(1234)
umap.ad <- umap(mat_exp_alz_genes,random_stage=1234, local.config)
umap.data <- as.data.frame(umap.ad$layout)
row.names(umap.data) <- row.names(mat_exp_alz_genes)
umap.data.covs <- merge(umap.data, covs, by = "row.names")
umap.data.covs$Row.names <- NULL
row.names(umap.data.covs) <- umap.data.covs$mrna_idrownames(umap.data) <- rownames(mat_exp_alz_genes)
# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.umap1 <- mean(umap.data[,1])
sd.umap1 <- sd(umap.data[,1])
mean.umap2 <- mean(umap.data[,2])
sd.umap2 <- sd(umap.data[,2])
# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.umap1 <- umap.data[abs(umap.data[,1] - mean.umap1) > 1.5 * sd.umap1, ]
outliers.umap2 <-umap.data[abs(umap.data[,2] - mean.umap2) > 1.5 * sd.umap2, ]mUMAP1.pos <- rownames(outliers.umap1[outliers.umap1[,1] > 0 , , drop = F])
mUMAP1.neg <- rownames(outliers.umap1[outliers.umap1[,1] < 0 , , drop = F])
mUMAP2.pos <- rownames(outliers.umap2[outliers.umap2[,1] > 0 , , drop = F])
mUMAP2.neg <- rownames(outliers.umap2[outliers.umap2[,1] < 0 , , drop = F])Tabla outliers UMAP
# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(mUMAP1.pos),
length(mUMAP1.neg),
length(mUMAP2.pos),
length(mUMAP2.neg))
# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_mtSNE1.pos <- c(mUMAP1.pos, rep("", max_length - length(mUMAP1.pos)))
rownames_mtSNE1.neg <- c(mUMAP1.neg, rep("", max_length - length(mUMAP1.neg)))
rownames_mtSNE2.pos <- c(mUMAP2.pos, rep("", max_length - length(mUMAP2.pos)))
rownames_mtSNE2.neg <- c(mUMAP2.neg, rep("", max_length - length(mUMAP2.neg)))
final_table <- data.frame(
mUMAP1.pos = rownames_mtSNE1.pos,
mUMAP1.neg = rownames_mtSNE1.neg,
mUMAP2.pos = rownames_mtSNE2.pos,
mUMAP2.neg = rownames_mtSNE2.neg
)
final_tableTabla frecuencia outliers SANOS Y ENFERMOS con PCA
# PCA
for (i in rownames(covs2)){
if (i %in% mPC1.pos){
covs2[i, "sampleset_PCA"] <- "mPC1 positive"
}
else if (i %in% mPC1.neg){
covs2[i, "sampleset_PCA"] <- "mPC1 negative"
}
else if (i %in% mPC2.pos ){
covs2[i, "sampleset_PCA"] <- "mPC2 positive"
}
else if (i %in% mPC2.neg){
covs2[i, "sampleset_PCA"] <- "mPC2 negative"
}
else {
covs2[i, "sampleset_PCA"] <- "Not in both"
}
}
covs2$sampleset_PCA <- as.factor(covs2$sampleset_PCA)
data.frame(table(covs2$sampleset_PCA))Tabla frecuencia outliers SOLAMENTE ENFERMOS con PCA
Tabla frecuencia outliers SANOS Y ENFERMOS con tSNE
#tsne
for (i in rownames(covs2)){
if (i %in% mtSNE1.pos){
covs2[i, "sampleset_tSNE"] <- "mtSNE1 positive"
}
else if (i %in% mtSNE1.neg){
covs2[i, "sampleset_tSNE"] <- "mtSNE1 negative"
}
else if (i %in% mtSNE2.pos){
covs2[i, "sampleset_tSNE"] <- "mtSNE2 positive"
}
else if (i %in% mtSNE2.neg){
covs2[i, "sampleset_tSNE"] <- "mtSNE2 negative"
} else {
covs2[i, "sampleset_tSNE"] <- "Not in both"
}
}
covs2$sampleset_tSNE <- as.factor(covs2$sampleset_tSNE)
data.frame(table(covs2$sampleset_tSNE))Tabla frecuencia outliers SOLAMENTE ENFERMOS con tSNE
Tabla frecuencia outliers SANOS Y ENFERMOS con UMAP
# UMAP
for (i in rownames(covs2)){
if (i %in% mUMAP1.pos){
covs2[i, "sampleset_UMAP"] <- "mUMAP1 positive"
}
else if (i %in% mUMAP1.neg){
covs2[i, "sampleset_UMAP"] <- "mUMAP1 negative"
}
else if (i %in% mUMAP2.pos){
covs2[i, "sampleset_UMAP"] <- "mUMAP2 positive"
}
else if (i %in% mUMAP2.neg){
covs2[i, "sampleset_UMAP"] <- "mUMAP2 negative"
} else {
covs2[i, "sampleset_UMAP"] <- "Not in both"
}
}
covs2$sampleset_UMAP <- as.factor(covs2$sampleset_UMAP)
data.frame(table(covs2$sampleset_UMAP))Tabla frecuencia outliers SOLAMENTE ENFERMOS con UMAP
Voy a comparar en una tabla los ratios de las covariables de los outliers comparando con los ratios de covariables de los no outliers
calculo.ratios <- function(df.covs){
ratios <- data.frame()
ratios$ratios <- numeric(0)
for (i in names(df.covs)) {
if (class(covs2[[i]]) == "factor") {
if (grepl("sampleset_", i) | grepl("batch", i)){
next
} else {
frecuencia <- table(df.covs[[i]])
if (length(frecuencia) == 2) {
ratio <- frecuencia[2] / frecuencia[1]
nuevafila <- data.frame(ratios = ratio)
row.names(nuevafila) <- i
ratios <- rbind(ratios, nuevafila)
}
}
}
}
return(ratios)
}data.frame.ratios <- function(df, sampleset) {
niveles.sampleset <- levels(df[[sampleset]])
ratios <- data.frame(matrix(ncol = 0, nrow = length(rownames(calculo.ratios(df)))))
for (i in niveles.sampleset) {
ratio <- df[df[[sampleset]] == i, ] %>%
calculo.ratios()
nombre.filas <- rownames(ratio)
colnames(ratio)[1] <- i
ratios <- cbind(ratios, ratio)
rownames(ratios) <- nombre.filas
}
return(ratios)
}ratio.ouliers.vs.no.ouliers <- data.frame(matrix(ncol = 0, nrow = length(rownames(ratios.PCA.kegg))))
for (i in colnames(ratios.PCA.kegg)) {
if (i == "Not in both") {
next
} else {
ratio <- data.frame(log2(ratios.PCA.kegg[[i]] / ratios.PCA.kegg[["Not in both"]]))
nombre.filas <- rownames(ratios.PCA.kegg)
colnames(ratio)[1] <- paste(i, " vs No outliers")
ratio.ouliers.vs.no.ouliers <- cbind(ratio.ouliers.vs.no.ouliers, ratio)
rownames(ratio.ouliers.vs.no.ouliers) <- nombre.filas
}
}
ratio.ouliers.vs.no.ouliers <- round(ratio.ouliers.vs.no.ouliers, 2)
ratio.ouliers.vs.no.ouliers <- ratio.ouliers.vs.no.ouliers %>%
filter(row_number() != which(rownames(ratio.ouliers.vs.no.ouliers) == "neuroStatus"))
ratio.ouliers.vs.no.ouliers <- replace(ratio.ouliers.vs.no.ouliers, is.na(ratio.ouliers.vs.no.ouliers), "sin casos")ratio outliers vs no outliers
calculo.test <- function(df.covs, sampleset){
resultados <- data.frame(
"Covariable" = character(0),
"Principal component" = character(0),
"Minimun expected frequency" = character(0),
"test" = character(0),
"p-value" = numeric(0),
"test 2" = character(0),
"p-value 2" = numeric(0),
"Real_mPC_0" = numeric(0),
"Expected_mPC_0" = numeric(0),
"Real_mPC_1" = numeric(0),
"Expected_mPC_1" = numeric(0),
"Real_NOT_mPC_0" = numeric(0),
"Expected_NOT_mPC_0" = numeric(0),
"Real_NOT_mPC_1" = numeric(0),
"Expected_NOT mPC_1" = numeric(0),
stringsAsFactors = FALSE
)
for (i in names(df.covs)) {
if (i == "neuroStatus") {
next
}
if (class(df.covs[[i]]) == "factor") {
if (grepl("sampleset_", i) | grepl("batch", i)){
next
} else {
for (j in levels(df.covs[[sampleset]])) {
if (j == "Not in both") {
next
} else {
df <- df.covs %>%
mutate(modified.class = ifelse(!!sym(sampleset) == j, j, paste("Not", j))) %>%
dplyr::select(all_of(i), modified.class) %>%
mutate(class = factor(modified.class, levels = c(j, paste("Not", j))))
frecuencia <- table(df[[i]], df[["class"]])
rownames(frecuencia) <- c(paste(i, rownames(frecuencia), sep = "_"))
expected <- chisq.test(frecuencia)$expected
if (any(expected <= 5)) {
# Obtener la posición del valor mínimo
posicion_minimo <- which(frecuencia == min(frecuencia), arr.ind = TRUE)
nombres.columnas <- colnames(frecuencia)
nombres.filas <- rownames(frecuencia)
test <- fisher.test(frecuencia)
test.utilizado <- "Fisher's exact test"
test2 <- chisq.test(frecuencia, correct = TRUE)
test2.utilizado <- "Chi-Square test with Yate's correction"
} else {
test <- chisq.test(frecuencia, correct = TRUE)
test.utilizado <- "Chi-Square test with Yate's correction"
test2 <- chisq.test(matrix(c(1,1,1,1), nrow = 2))
test2.utilizado <- "N/A"
}
nuevafila <- data.frame(
"Covariable" = i,
"Principal component" = j,
"Minimun expected frequency" = round(min(expected), 2),
"test" = test.utilizado,
"p-value" = round(test$p.value, 3),
"test 2" = test2.utilizado,
"p-value 2" = ifelse(test2.utilizado == "N/A", NA, round(test2$p.value, 3)),
"Real_mPC_0" = round(frecuencia[1,1], 2),
"Expected_mPC_0" = round(expected[1,1], 2),
"Real_mPC_1" = round(frecuencia[2,1], 2),
"Expected_mPC_1" = round(expected[2,1], 2),
"Real_NOT_mPC_0" = round(frecuencia[1,2], 2),
"Expected_NOT_mPC_0" = round(expected[1,2], 2),
"Real_NOT_mPC_1" = round(frecuencia[2,2], 2),
"Expected_NOT mPC_1" = round(expected[2,2], 2),
stringsAsFactors = FALSE
)
resultados <- rbind(resultados, nuevafila)
}
}
}
}
}
return(resultados)
}Calculo test estadistico
tests <- calculo.test(df.covs = covs2.casos, sampleset = "sampleset_PCA") %>%
filter(p.value <= 0.1) plot.pvalue.sig <- function(df, test.df) {
plots <- list()
for (i in 1:nrow(test.df)) {
variable <- as.character(test.df[i, 1])
component <- as.character(test.df[i, 2])
Minimun.expected.frequency <- as.character(test.df[i, 3])
test.used <- as.character(test.df[i, 4])
p.value <- as.character(test.df[i, 5])
df.filtered <- df %>%
mutate(modified.class = ifelse(sampleset_PCA == component, component, paste("Not", component))) %>%
dplyr::select(all_of(variable), modified.class) %>%
mutate(class = factor(modified.class, levels = c(component, paste("Not", component))))
p <- ggbarstats(
data = df.filtered,
x = !!sym(variable),
y = class,
xlab = "", title = paste(component, variable),
results.subtitle = FALSE,
subtitle = paste0(test.used, " with p-value = ", p.value, ". \nMinimum expected freq: ", Minimun.expected.frequency))
plots[[paste(variable, component)]] <- p
}
return(plots)
}
plots <- plot.pvalue.sig(df = covs2.casos, test.df = tests)
plot_list <- lapply(plots, ggplotGrob)
grid.arrange(grobs = plot_list, ncol = 2)ratio.ouliers.vs.no.ouliers <- data.frame(matrix(ncol = 0, nrow = length(rownames(ratios.PCA.kegg))))
for (i in colnames(ratios.PCA.kegg)) {
if (i == "Not in both") {
next
} else {
ratio <- data.frame(log2(ratios.PCA.kegg[[i]] / ratios.PCA.kegg[["Not in both"]]))
nombre.filas <- rownames(ratios.PCA.kegg)
colnames(ratio)[1] <- paste(i, " vs No outliers")
ratio.ouliers.vs.no.ouliers <- cbind(ratio.ouliers.vs.no.ouliers, ratio)
rownames(ratio.ouliers.vs.no.ouliers) <- nombre.filas
}
}
ratio.ouliers.vs.no.ouliers <- round(ratio.ouliers.vs.no.ouliers, 2)
ratio.ouliers.vs.no.ouliers <- ratio.ouliers.vs.no.ouliers %>%
filter(row_number() != which(rownames(ratio.ouliers.vs.no.ouliers) == "neuroStatus"))
ratio.ouliers.vs.no.ouliers <- replace(ratio.ouliers.vs.no.ouliers, is.na(ratio.ouliers.vs.no.ouliers), "sin casos")ratio outliers vs no outliers
ratio.ouliers.vs.no.ouliers <- data.frame(matrix(ncol = 0, nrow = length(rownames(ratios.umap.kegg))))
for (i in colnames(ratios.umap.kegg)) {
if (i == "Not in both") {
next
} else {
ratio <- data.frame(log2(ratios.umap.kegg[[i]] / ratios.umap.kegg[["Not in both"]]))
nombre.filas <- rownames(ratios.umap.kegg)
colnames(ratio)[1] <- paste(i, " vs No outliers")
ratio.ouliers.vs.no.ouliers <- cbind(ratio.ouliers.vs.no.ouliers, ratio)
rownames(ratio.ouliers.vs.no.ouliers) <- nombre.filas
}
}
ratio.ouliers.vs.no.ouliers <- round(ratio.ouliers.vs.no.ouliers, 2)
ratio.ouliers.vs.no.ouliers <- ratio.ouliers.vs.no.ouliers %>%
filter(row_number() != which(rownames(ratio.ouliers.vs.no.ouliers) == "neuroStatus"))
ratio.ouliers.vs.no.ouliers <- replace(ratio.ouliers.vs.no.ouliers, is.na(ratio.ouliers.vs.no.ouliers), "sin casos")loadings <- pca_KEGG$rotation
loading.pc1.pos <- loadings[loadings[, "PC1"] > 0, "PC1"]
loading.pc1.neg <- loadings[loadings[, "PC1"] < 0, "PC1"]
loading.pc2.pos <- loadings[loadings[, "PC2"] > 0, "PC2"]
loading.pc2.neg <- loadings[loadings[, "PC2"] < 0, "PC2"]
best.loadings.pc1.pos <- data.frame(loading = head(sort(loading.pc1.pos, decreasing = TRUE), 5))
best.loadings.pc1.neg <- data.frame(loading =head(sort(loading.pc1.neg, decreasing = FALSE), 5), pc = "mPC1 negative")
best.loadings.pc2.pos <- data.frame(loading =head(sort(loading.pc2.pos, decreasing = TRUE), 5), pc = "mPC2 positive")
best.loadings.pc2.neg <- data.frame(loading =head(sort(loading.pc2.neg, decreasing = FALSE), 5), pc = "mPC2 negative")pca.var <- get_pca_var(pca_KEGG)
pca.var.coord <- data.frame(pca.var$coord[,1:2])
pca.var.coord.best.loadings <- pca.var.coord[(rownames(pca.var.coord) %in% rownames(best.loadings.pc1.pos)) | (rownames(pca.var.coord) %in% rownames(best.loadings.pc1.neg)) | (rownames(pca.var.coord) %in% rownames(best.loadings.pc2.pos)) | (rownames(pca.var.coord) %in% rownames(best.loadings.pc2.neg)), ]
pca.var.coord.best.loadings <- pca.var.coord.best.loadings %>%
mutate(PC = case_when(
row.names(.) %in% rownames(best.loadings.pc1.pos) ~ "mPC1 positive",
row.names(.) %in% rownames(best.loadings.pc1.neg) ~ "mPC1 negative",
row.names(.) %in% rownames(best.loadings.pc2.pos) ~ "mPC2 positive",
row.names(.) %in% rownames(best.loadings.pc2.neg) ~ "mPC2 negative",
TRUE ~ NA_character_ # para cualquier otra condición no especificada
))pca.data.kegg <- data.frame(pca_KEGG$x)[,1:2]
colnames(pca.data.kegg) <- c("PC1.KEGG", "PC2.KEGG")
covs2 <- merge(covs2, pca.data.kegg, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULLpca.data.kegg <- data.frame(pca_KEGG$x)[,1:2]
colnames(pca.data.kegg) <- c("PC1.KEGG", "PC2.KEGG")
covs2_df <- covs2.casos %>%
dplyr::select(mrna_id, sampleset_PCA)
para.plot <- merge(pca.data.kegg, covs2_df, by = "row.names")
rownames(para.plot) <- para.plot$Row.names
para.plot$Row.names <- NULL
pca.var.coord.best.loadings$id <- rownames(pca.var.coord.best.loadings)
p <- ggplot(para.plot, aes_string(x = "PC1.KEGG", y = "PC2.KEGG", color = "sampleset_PCA")) +
geom_point(show.legend = TRUE, size = 3) +
geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outlierspc1) | mrna_id %in% rownames(outlierspc2), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
color = "black",
max.overlaps = 30, # Reduce el número máximo de solapamientos
point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
size = 3,
fontface = "bold",
segment.size = 0.2, # Líneas de guía más finas
segment.color = 'grey50',
max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
theme_minimal() +
labs(title = "PCA plot loadings",
x = "PC1.KEGG", y = "PC2.KEGG") +
theme(
legend.title = element_blank(),
legend.text = element_text(size = 12),
text = element_text(size = 12),
legend.key.size = unit(0.5, "cm"),
plot.margin = margin(5, 5, 5, 5)) +
geom_point(data= (covs2 %>% filter(neuroStatus == 0)), aes_string(x = "PC1.KEGG", y = "PC2.KEGG", color = "sampleset_PCA"), alpha = 0.5)+
geom_segment(data = pca.var.coord.best.loadings, aes(x= 0, y = 0, xend = Dim.1*20, yend = Dim.2*20, color = PC), arrow = arrow(length = unit(0.05, "inches"), type = "closed"),
size = 0.4) +
geom_text_repel(data = pca.var.coord.best.loadings,
aes(x = Dim.1*22,
y = Dim.2*22,
label=ifelse((id %in% rownames(best.loadings.pc1.pos) | id %in% rownames(best.loadings.pc1.neg) | id %in% rownames(best.loadings.pc2.pos) | id %in% rownames(best.loadings.pc2.neg)), id, ""),
color = PC),
max.overlaps = 10, # Reduce el número máximo de solapamientos
size = 3,
fontface = "bold",
segment.color = 'grey50',
arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last"))
poutliers.PCA <- c(rownames(outlierspc1), rownames(outlierspc2))
outliers.PCA <- outliers.PCA[outliers.PCA %in% rownames(covs2.casos)]
outliers.tSNE <- c(rownames(outliers.tsne1), rownames(outliers.tsne2))
outliers.tSNE <- outliers.tSNE[outliers.tSNE %in% rownames(covs2.casos)]
outliers.UMAP <- c(rownames(outliers.umap1), rownames(outliers.umap2))
outliers.UMAP <- outliers.UMAP[outliers.UMAP %in% rownames(covs2.casos)]
all.outliers <- list(PCA.KEGG = outliers.PCA,
tSNE.KEGG = outliers.tSNE,
UMAP.KEGG = outliers.UMAP)
max_length <- max(sapply(all.outliers, length))
# Función para normalizar las longitudes de las listas
normalize_length <- function(x, max_length) {
length(x) <- max_length # Esto rellenará con NA si la lista es más corta que max_length
x
}
# Aplicar la normalización a cada lista en 'all.outliers'
all.outliers <- lapply(all.outliers, normalize_length, max_length = max_length)
# Convertir la lista normalizada en un data.frame
all.outliers <- data.frame(all.outliers)# Generar el diagrama de Venn
venn.plot <- venn.diagram(
x = lista.outliers[1:3],
category.names = c("PCA", "tSNE", "UMAP"),
filename = NULL,
output = FALSE,
fill = c("#440154ff", '#21908dff', '#fde725ff'),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorías
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = c(0, 0, 0), #posición de las categorías
cat.dist = 0.05, #distancia de las categorías
rotation.degree = 0,
margin = 0.05, # hacerla más pequeña
lwd = 0.5,
lty = "dashed", # Estilo de línea discontinua
edge.col = "grey", # Color de los bordes
main = paste0("Diagrama Venn KEGG geneset"),
main.fontface = "bold",
main.cex = 2,
main.pos = c(0.5, 1)
)
# Calcular la intersección
interseccion_ABC <- Reduce(intersect, lista.outliers[1:3])
# Dibuja el diagrama de Venn y el texto de la intersección
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)# interseccion_AB <- Reduce(intersect, lista.outliers[1:2])
# grid.text(label = paste(gsub("_.*","",interseccion_AB[!(interseccion_AB %in% interseccion_ABC)]), collapse = "\n"),
# x = 0.68,
# y = 0.33,
# gp = gpar(fontsize = 8,
# col = "black",
# fontface = "italic"))
#
# interseccion_AC <- Reduce(intersect, lista.outliers[c(1,3)])
# grid.text(label = paste(gsub("_.*","",interseccion_AC[!(interseccion_AC %in% interseccion_ABC)]), collapse = "\n"),
# x = 0.365,
# y = 0.25,
# gp = gpar(fontsize = 8,
# col = "black",
# fontface = "italic"))upset(fromList(lista.outliers),
order.by = "freq",
sets.x.label = "Patients by method",
mainbar.y.label = "Patient intersections",
point.size = 3.5, line.size = 1.5,
text.scale = c(1.3, 1.3, 1, 1, 1.5, 1.3),
empty.intersections = "on",
queries = list(list(query = intersects, params = list("PCA.KEGG",
"tSNE.KEGG", "UMAP.KEGG"), color = "orange", active = T)),
)outliers.PCA <- c(rownames(outlierspc1), rownames(outlierspc2))
outliers.tSNE <- c(rownames(outliers.tsne1), rownames(outliers.tsne2))
outliers.UMAP <- c(rownames(outliers.umap1), rownames(outliers.umap2))
all.outliers <- list(PCA.KEGG = outliers.PCA,
tSNE.KEGG = outliers.tSNE,
UMAP.KEGG = outliers.UMAP)
max_length <- max(sapply(all.outliers, length))
# Función para normalizar las longitudes de las listas
normalize_length <- function(x, max_length) {
length(x) <- max_length # Esto rellenará con NA si la lista es más corta que max_length
x
}
# Aplicar la normalización a cada lista en 'all.outliers'
all.outliers <- lapply(all.outliers, normalize_length, max_length = max_length)
# Convertir la lista normalizada en un data.frame
all.outliers <- data.frame(all.outliers)# Generar el diagrama de Venn
venn.plot <- venn.diagram(
x = lista.outliers[1:3],
category.names = c("PCA", "tSNE", "UMAP"),
filename = NULL,
output = FALSE,
fill = c("#440154ff", '#21908dff', '#fde725ff'),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorías
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = c(0, 0, 0), #posición de las categorías
cat.dist = 0.05, #distancia de las categorías
rotation.degree = 0,
margin = 0.05, # hacerla más pequeña
lwd = 0.5,
lty = "dashed", # Estilo de línea discontinua
edge.col = "grey", # Color de los bordes
main = paste0("Diagrama Venn KEGG geneset"),
main.fontface = "bold",
main.cex = 2,
main.pos = c(0.5, 1)
)
# Calcular la intersección
interseccion_ABC <- Reduce(intersect, lista.outliers[1:3])
# Dibuja el diagrama de Venn y el texto de la intersección
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)# interseccion_AB <- Reduce(intersect, lista.outliers[1:2])
# grid.text(label = paste(gsub("_.*","",interseccion_AB[!(interseccion_AB %in% interseccion_ABC)]), collapse = "\n"),
# x = 0.68,
# y = 0.33,
# gp = gpar(fontsize = 8,
# col = "black",
# fontface = "italic"))
#
# interseccion_AC <- Reduce(intersect, lista.outliers[c(1,3)])
# grid.text(label = paste(gsub("_.*","",interseccion_AC[!(interseccion_AC %in% interseccion_ABC)]), collapse = "\n"),
# x = 0.365,
# y = 0.25,
# gp = gpar(fontsize = 8,
# col = "black",
# fontface = "italic"))upset(fromList(lista.outliers),
order.by = "freq",
sets.x.label = "Patients by method",
mainbar.y.label = "Patient intersections",
point.size = 3.5, line.size = 1.5,
text.scale = c(1.3, 1.3, 1, 1, 1.5, 1.3),
empty.intersections = "on",
queries = list(list(query = intersects, params = list("PCA.KEGG",
"tSNE.KEGG", "UMAP.KEGG"), color = "orange", active = T)),
)# # Instalación de los paquetes necesarios
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("clusterProfiler", force = T)
# BiocManager::install("org.Hs.eg.db")
# BiocManager::install("enrichplot")
#
# # Cargar los paquetes
# library(clusterProfiler)
# library(org.Hs.eg.db)
# library(enrichplot)
#
# # Lista de genes
# best50.loadings.pc1.pos <- rownames(data.frame(loading = head(sort(loading.pc1.pos, decreasing = TRUE), 50)))
# best50.loadings.pc1.neg <- data.frame(loading =head(sort(loading.pc1.neg, decreasing = FALSE), 5))
# best50.loadings.pc2.pos <- data.frame(loading =head(sort(loading.pc2.pos, decreasing = TRUE), 5))
# best50.loadings.pc2.neg <- data.frame(loading =head(sort(loading.pc2.neg, decreasing = FALSE), 5))
#
# # Preparar los datos de genes
# gene_list <- bitr(best50.loadings.pc1.pos, fromType="ENTREZID", toType=c("ENSEMBL", "SYMBOL"), OrgDb="org.Hs.eg.db")
#
# # Análisis de sobre-representación usando la base de datos KEGG
# ego <- enrichGO(gene = best50.loadings.pc1.pos,
# OrgDb = "org.Hs.eg.db",
# keyType = "SYMBOL",
# ont = "ALL",
# pAdjustMethod = "BH",
# qvalueCutoff = 0.05,
# readable = TRUE)
#
# barplot(ego, showCategory=10)
#
#
# library(org.Hs.eg.db)
#
# # Check if the top gene symbols are valid
# valid_genes <- keys(org.Hs.eg.db, keytype = "SYMBOL")
# head(valid_genes)
#
# # Filter your gene list to include only valid genes
# valid_gene_list <- best50.loadings.pc1.pos[best50.loadings.pc1.pos %in% valid_genes]
#
# # If valid_gene_list is significantly shorter, it indicates many invalid gene symbols.
# length(valid_gene_list)
#
# sorted_valid_gene_list <- sort(best50.loadings.pc1.pos, decreasing = TRUE)
#
#
# gse <- gseGO(geneList= sorted_valid_gene_list,
# ont ="ALL",
# keyType = "SYMBOL",
# pvalueCutoff = 0.05,
# verbose = TRUE,
# OrgDb = "org.Hs.eg.db",
# pAdjustMethod = "none")
#
#
# require(DOSE)
# ridgeplot(gse)